library(ggplot2)
library(reshape2)
library(dplyr)
library(tidyverse)

Variable

Dependent variable: MICHD

Independent variables: all other

Data Prep

michd = read.csv("MICHD.csv")
str(michd)
## 'data.frame':    184877 obs. of  20 variables:
##  $ Urban           : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ PhysicalHealth  : num  0 0 0 0 0 ...
##  $ MentalHealth    : num  0 1.79 0 0 1.61 ...
##  $ PhysicalActivity: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Asthma          : int  3 3 3 3 1 3 3 3 3 3 ...
##  $ Arthritis       : int  0 0 0 0 1 0 0 1 0 1 ...
##  $ Sex             : int  0 0 1 0 0 1 1 1 1 0 ...
##  $ Race            : int  1 1 1 1 1 4 1 1 1 1 ...
##  $ Age             : int  2 3 5 5 6 3 6 5 5 4 ...
##  $ BMI             : num  25.6 29.5 31 35.4 25.7 ...
##  $ Education       : int  4 4 3 3 2 4 4 4 4 1 ...
##  $ Smoke           : int  4 4 4 4 4 1 3 4 4 4 ...
##  $ Alcohol         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ HHADULT         : num  0.693 0.693 0.693 0.693 0.693 ...
##  $ HIV             : int  1 0 0 0 0 0 0 0 0 1 ...
##  $ Fruit           : num  0.00693 0.00358 0.00405 0.00451 0.01418 ...
##  $ Vegetable       : num  1.061 0.963 1.051 0.871 1.188 ...
##  $ Cholesterol     : int  0 0 1 0 1 0 1 1 1 0 ...
##  $ Stroke          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ MICHD           : int  0 0 0 0 0 0 1 0 0 0 ...
col_factor = c("Urban", "PhysicalActivity","Asthma","Arthritis","Sex","Race","Age",
              "Education","Alcohol", "Smoke","HIV","Cholesterol", "Stroke", "MICHD")
michd[col_factor] = lapply(michd[col_factor], factor)
str(michd)
## 'data.frame':    184877 obs. of  20 variables:
##  $ Urban           : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 2 2 2 2 ...
##  $ PhysicalHealth  : num  0 0 0 0 0 ...
##  $ MentalHealth    : num  0 1.79 0 0 1.61 ...
##  $ PhysicalActivity: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Asthma          : Factor w/ 3 levels "1","2","3": 3 3 3 3 1 3 3 3 3 3 ...
##  $ Arthritis       : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 2 1 2 ...
##  $ Sex             : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 2 2 2 1 ...
##  $ Race            : Factor w/ 8 levels "1","2","3","4",..: 1 1 1 1 1 4 1 1 1 1 ...
##  $ Age             : Factor w/ 6 levels "1","2","3","4",..: 2 3 5 5 6 3 6 5 5 4 ...
##  $ BMI             : num  25.6 29.5 31 35.4 25.7 ...
##  $ Education       : Factor w/ 4 levels "1","2","3","4": 4 4 3 3 2 4 4 4 4 1 ...
##  $ Smoke           : Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 1 3 4 4 4 ...
##  $ Alcohol         : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ HHADULT         : num  0.693 0.693 0.693 0.693 0.693 ...
##  $ HIV             : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 2 ...
##  $ Fruit           : num  0.00693 0.00358 0.00405 0.00451 0.01418 ...
##  $ Vegetable       : num  1.061 0.963 1.051 0.871 1.188 ...
##  $ Cholesterol     : Factor w/ 2 levels "0","1": 1 1 2 1 2 1 2 2 2 1 ...
##  $ Stroke          : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ MICHD           : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
# Density Plot
michd.num <- michd %>% select(c("PhysicalHealth", "MentalHealth", "BMI", "HHADULT", "Fruit", "Vegetable", "MICHD"))

ggplot(melt(michd.num), aes(x = value, fill = MICHD)) +
  geom_density(alpha = 0.5) +
  facet_wrap( ~ variable, scale = "free")

# Conditional density plots
a <- michd %>%
  ggplot() +
  geom_histogram(aes(x=PhysicalHealth,
                     fill=(MICHD==1)),
                 position='fill') +  
  theme_bw() +
  xlab('Number of Days Physical Health Not Good in past 30 Days') +
  ylab('Frequency') +
  theme(axis.title=element_text(size=14), 
        axis.text=element_text(size=14), 
        legend.text=element_text(size=14)) +
  scale_fill_manual(name='', 
                    labels=c('No MICHD', 'Had MICHD'), 
                    values=c('grey', 'red'))

b <- michd %>%
  ggplot() +
  geom_histogram(aes(x=MentalHealth,
                     fill=(MICHD==1)),
                 position='fill') +  
  theme_bw() +
  xlab('Number of Days Mental Health Not Good in past 30 Days') +
  ylab('Frequency') +
  theme(axis.title=element_text(size=14), 
        axis.text=element_text(size=14), 
        legend.text=element_text(size=14)) +
  scale_fill_manual(name='', 
                    labels=c('No MICHD', 'Had MICHD'), 
                    values=c('grey', 'red'))

c <- michd %>%
  ggplot() +
  geom_histogram(aes(x=BMI,
                     fill=(MICHD==1)),
                 position='fill') +  
  theme_bw() +
  xlab('BMI') +
  ylab('Frequency') +
  theme(axis.title=element_text(size=14), 
        axis.text=element_text(size=14), 
        legend.text=element_text(size=14)) +
  scale_fill_manual(name='', 
                    labels=c('No MICHD', 'Had MICHD'), 
                    values=c('grey', 'red'))



d <- michd %>%
  ggplot() +
  geom_histogram(aes(x=HHADULT,
                     fill=(MICHD==1)),
                 position='fill') +  
  theme_bw() +
  xlab('HHADULT') +
  ylab('Frequency') +
  theme(axis.title=element_text(size=14), 
        axis.text=element_text(size=14), 
        legend.text=element_text(size=14)) +
  scale_fill_manual(name='', 
                    labels=c('No MICHD', 'Had MICHD'), 
                    values=c('grey', 'red'))


e <- michd %>%
  ggplot() +
  geom_histogram(aes(x=Fruit,
                     fill=(MICHD==1)),
                 position='fill') +  
  theme_bw() +
  xlab('Total fruits consumed per day') +
  ylab('Frequency') +
  theme(axis.title=element_text(size=14), 
        axis.text=element_text(size=14), 
        legend.text=element_text(size=14)) +
  scale_fill_manual(name='', 
                    labels=c('No MICHD', 'Had MICHD'), 
                    values=c('grey', 'red'))

f <- michd %>%
  ggplot() +
  geom_histogram(aes(x=Vegetable,
                     fill=(MICHD==1)),
                 position='fill') +  
  theme_bw() +
  xlab('Total Vegatables consumed per day') +
  ylab('Frequency') +
  theme(axis.title=element_text(size=14), 
        axis.text=element_text(size=14), 
        legend.text=element_text(size=14)) +
  scale_fill_manual(name='', 
                    labels=c('No MICHD', 'Had MICHD'), 
                    values=c('grey', 'red'))


library(gridExtra)
grid.arrange(a,b,c,d,e,f, ncol = 2, nrow = 3)

Correlation Plot

# Select numeric variables
michd.cor = michd  %>% select(c("PhysicalHealth", "MentalHealth", 
                              "BMI", "HHADULT", "Fruit", "Vegetable"))

michd.cor$Age.rank <- rank(michd$Age)
michd.cor$Education.rank <- rank(michd$Education)
michd.cor$Smoke.rank <- rank(michd$Smoke)

c = cor(michd.cor, method = "spearman")

library(corrplot)
corrplot(c, addCoef.col = "red", tl.cex = 0.8)

Clustering

set.seed(123)
michd_matrix_c <- model.matrix(~ . - 1, data = michd[, colnames(michd)])

michd_matrix_c[,'BMI'] <- scale(michd_matrix_c[,'BMI'])
michd_matrix_c[,'HHADULT'] <- scale(michd_matrix_c[,'HHADULT'])
michd_matrix_c[,'Fruit'] <- scale(michd_matrix_c[,'Fruit'])
michd_matrix_c[,'Vegetable'] <- scale(michd_matrix_c[,'Vegetable'])
set.seed(123)
twcv = function(k) kmeans(michd_matrix_c, k, nstart = 25)$tot.withinss
k <- 1:15
twcv_values <- sapply(k, twcv)
plot(twcv_values, type = 'b', pch = 19)

library(factoextra)
k6 <- kmeans(michd_matrix_c, centers = 6, nstart = 25)
fviz_cluster(k6, geom = "point", data = michd_matrix_c)

k8 <- kmeans(michd_matrix_c, centers = 8, nstart = 25)
fviz_cluster(k8, geom = "point", data = michd_matrix_c)

k2 <- kmeans(michd_matrix_c, centers = 2, nstart = 25)
fviz_cluster(k2, geom = "point", data = michd_matrix_c)

PCA

pca = prcomp(michd_matrix_c)
fviz_pca_biplot(pca,repel = TRUE, col.var = "red", col.ind = "white", label = c("var"))